library("data.table")
library("haven")
library("arrow")
# Relative paths used later in this script ("pseudo_id_seniority/...") are
# resolved against the working directory, which is moved to the folder of the
# script when it is run from an RStudio editor tab. Outside RStudio (Rscript,
# plain R console), or when no saved document is active, the working
# directory is left untouched instead of failing.
local({
  if (requireNamespace("rstudioapi", quietly = TRUE) &&
      rstudioapi::isAvailable()) {
    script_path <- rstudioapi::getActiveDocumentContext()$path
    if (length(script_path) == 1L && nzchar(script_path)) {
      setwd(dirname(script_path))
    }
  }
})

# Directories holding the DADS postes parquet files, one variable per year.
# psid_sen() looks these up by name ("lib" followed by the year), and each
# path must end with a separator because file names are appended directly.
lib2021 <- "\\\\casd.fr\\casdfs\\Projets\\INEPROG\\Data\\DADS_DADS Postes_2021\\Format parquet\\"
lib2022 <- "\\\\casd.fr\\casdfs\\Projets\\INEPROG\\Data\\DADS_DADS Postes_2022\\"
lib2023 <- "\\\\casd.fr\\casdfs\\Projets\\INEPROG\\Data\\DADS_DADS Postes_2023\\"

#-----------------------------------------#
# Function ####
#-----------------------------------------#
  # Build and save the seniority file of year `y`.
  #
  # Reads the DADS postes parquet files of years `y` and `y - 1` (directories
  # taken from the variables `lib<y>` and `lib<y - 1>`), attaches the columns
  # of `psid_<y>.sas7bdat` (including `ident_all`) by `ident_s`, and carries
  # `est_entry` / `firm_entry` forward from the year `y - 1` seniority file
  # stored in `pseudo_id_seniority/` (relative to the working directory).
  #
  # Arguments:
  #   y    Single numeric year to process (e.g. 2022).
  #   out  Output format: "dta" writes a Stata file, any other value writes
  #        an RDS file. The same format is used to read the year `y - 1`
  #        seniority file, except for y == 2022 where the 2021 file is read
  #        from SAS.
  #
  # Returns NULL invisibly; the function is called for its side effect of
  # writing `pseudo_id_seniority/psid_sen_<y>.<dta|rds>`.
  psid_sen <- function(y, out = "rds") {

    stopifnot(is.numeric(y), length(y) == 1L, !is.na(y))

    # Load every "post_<year>" parquet file of one year, keeping only the key
    # columns and the rows with a strictly positive s_brut.
    read_postes <- function(year) {
      lib_name <- paste0("lib", year)
      if (!exists(lib_name)) {
        stop("No DADS directory defined for year ", year,
             " (expected a variable named '", lib_name, "')", call. = FALSE)
      }
      dir_year <- get(lib_name)
      files_year <- list.files(path = dir_year)
      files_year <- files_year[grep(paste0("post_", year), files_year)]
      print(dir_year)
      # Fail early: 1:0 would otherwise try to read a file named "NA".
      if (length(files_year) == 0L) {
        stop("No 'post_", year, "' file found in ", dir_year, call. = FALSE)
      }

      chunks <- lapply(files_year, function(file_name) {
        chunk <- read_parquet(paste0(dir_year, file_name),
                              col_select = c(ident_s, siren, nic, s_brut))
        # read_parquet() does not promise a data.table; make sure the
        # data.table filter below is valid whatever class comes back.
        setDT(chunk)
        chunk <- chunk[s_brut > 0]
        print(file_name)
        print(nrow(chunk))
        chunk
      })
      # Bind once instead of growing the table file after file.
      rbindlist(chunks, fill = TRUE)
    }

    y_1 <- y - 1

    dads_y <- read_postes(y)
    dads_prev <- read_postes(y_1)

    # Only the presence of (ident_s, siren, nic) in y - 1 is used below, so
    # the previous year is reduced to one row per key. This keeps the left
    # join from duplicating current-year rows when a person holds several
    # postes in the same establishment in y - 1.
    prev_keys <- unique(dads_prev[, list(ident_s, siren, nic)])
    prev_keys[, in_prev := TRUE]
    rm(dads_prev)

    print(nrow(dads_y))
    dads_y <- merge(dads_y, prev_keys,
                    by = c("ident_s", "siren", "nic"), all.x = TRUE)
    print(nrow(dads_y))
    rm(prev_keys)

    # Default entry year when nothing can be carried over from y - 1:
    # y - 1 if the person was already in the establishment in y - 1, else y.
    dads_y[, entry_default := y - as.integer(!is.na(in_prev))]

    # Attach the pseudo-identifier table of year y.
    psid_y <- as.data.table(read_sas(
      paste0("C:\\Users\\Public\\Documents\\pseudo_id\\psid_", y, ".sas7bdat")
    ))
    colnames(psid_y) <- tolower(colnames(psid_y))

    dads_y <- merge(dads_y, psid_y, by = "ident_s", all.x = TRUE)
    rm(psid_y)

    # Rows without an ident_all get one built from ident_s and the two-digit
    # year (ident_s * 100 + yy).
    # NOTE(review): ident_all is compared with "" while the fallback is
    # numeric; confirm the intended column type.
    dads_y[, ident_all := ifelse(is.na(ident_all) | ident_all == "",
                                 ident_s * 100 + as.numeric(substr(y, 3, 4)),
                                 ident_all)]

    # Seniority file of year y - 1. For y == 2022 the 2021 file is read from
    # SAS; otherwise it is read in the format selected by `out`.
    prev_base <- paste0("pseudo_id_seniority/psid_sen_", y_1)
    if (y == 2022) {
      prev_sen <- as.data.table(read_sas(paste0(prev_base, ".sas7bdat")))
      colnames(prev_sen) <- tolower(colnames(prev_sen))
    } else if (out == "dta") {
      prev_sen <- as.data.table(read_dta(paste0(prev_base, ".dta")))
      colnames(prev_sen) <- tolower(colnames(prev_sen))
    } else {
      prev_sen <- as.data.table(readRDS(paste0(prev_base, ".rds")))
    }

    # Earliest establishment / firm entry year recorded in y - 1 per key.
    # No na.rm on purpose: a group containing NA yields NA and therefore
    # falls back to the default entry year.
    est_prev <- prev_sen[, list(est_entry = min(est_entry)),
                         by = list(ident_all, siren, nic)]
    firm_prev <- prev_sen[, list(firm_entry = min(firm_entry)),
                          by = list(ident_all, siren)]
    rm(prev_sen)

    print(nrow(dads_y))
    dads_y <- merge(dads_y, est_prev,
                    by = c("ident_all", "siren", "nic"), all.x = TRUE)
    print(nrow(dads_y))
    rm(est_prev)
    dads_y[, est_entry := ifelse(is.na(est_entry), entry_default, est_entry)]

    print(nrow(dads_y))
    dads_y <- merge(dads_y, firm_prev,
                    by = c("ident_all", "siren"), all.x = TRUE)
    print(nrow(dads_y))
    rm(firm_prev)
    # NOTE(review): the firm-level default relies on establishment-level
    # presence in y - 1 (same rule as est_entry) - confirm this is intended.
    dads_y[, firm_entry := ifelse(is.na(firm_entry), entry_default, firm_entry)]

    # Working columns are not part of the saved file.
    dads_y[, c("s_brut", "in_prev", "entry_default") := NULL]

    out_base <- paste0("pseudo_id_seniority/psid_sen_", y)
    if (out == "dta") {
      write_dta(dads_y, paste0(out_base, ".dta"), version = 14)
    } else {
      saveRDS(dads_y, paste0(out_base, ".rds"))
    }
    invisible(NULL)
  }
    

#-----------------------------------------#
# Use ####
#-----------------------------------------#

 
# Years are processed in ascending order: the run for a given year reads the
# seniority file written by the run for the previous year.
invisible(lapply(c(2022, 2023), psid_sen, out = "rds"))
  
